# Corpus data selected by the two different extreme-word heuristics
corpus.inter <- read.csv("../data/extreme_intersection.csv")
corpus.inter$DataSource <- "human_norming_intersective"
corpus.non.inter <- read.csv("../data/extreme.csv")
corpus.non.inter$DataSource <- "human_norming"
# ChatGPT ratings for the intersective extreme-word list (tab-separated)
gpt.inter.val <- read.delim("../data/chatgpt_extreme-intersection_valence.txt")
gpt.inter.val$DataSource <- "ChatGPT_extreme_inter"
gpt.inter.conc <- read.delim("../data/chatgpt_extreme-intersection_concreteness.txt")
gpt.inter.conc$DataSource <- "ChatGPT_extreme_inter"
# ChatGPT ratings for the non-intersective extreme-word list
gpt.non.inter.val <- read.delim("../data/chatgpt_extreme_valence.txt")
gpt.non.inter.val$DataSource <- "ChatGPT_extreme"
gpt.non.inter.conc <- read.delim("../data/chatgpt_extreme_concrete.txt")
gpt.non.inter.conc$DataSource <- "ChatGPT_extreme"
# ChatGPT choosing the words itself from the "complete" word list;
# rename the "Estimated..." columns to match the other ChatGPT files
gpt.total.conc <- read.delim("../data/GPT_40_conc.txt")
gpt.total.conc$DataSource <- "ChatGPT_total"
colnames(gpt.total.conc)[colnames(gpt.total.conc) == "Estimated...Concrete"] <- "Concrete.."
colnames(gpt.total.conc)[colnames(gpt.total.conc) == "Estimated...Abstract"] <- "Abstract.."
gpt.total.val <- read.delim("../data/GPT_40_val.txt")
gpt.total.val$DataSource <- "ChatGPT_total"
colnames(gpt.total.val)[colnames(gpt.total.val) == "Estimated...Positive"] <- "Positive.."
colnames(gpt.total.val)[colnames(gpt.total.val) == "Estimated...Negative"] <- "Negative.."
# For Concreteness
# Define the function to process a single data frame and create a new data frame with .prop added to its name
process_and_rename_conc_dfs <- function(df_list) {
  # For each named data frame in df_list: strip the "%" sign from Concrete..,
  # convert it to a 0-1 proportion in PropConcrete, drop the raw percentage
  # columns, and write the result back to the global environment under the
  # name "<original name>.prop".
  for (nm in names(df_list)) {
    d <- df_list[[nm]]
    d$Concrete.. <- as.numeric(gsub("%", "", d$Concrete..))
    d$PropConcrete <- d$Concrete.. / 100
    d <- subset(d, select = -c(Concrete.., Abstract..))
    # Side effect: creates e.g. gpt.total.conc.prop in the global environment
    assign(paste0(nm, ".prop"), d, envir = .GlobalEnv)
  }
}
# Build the named list of ChatGPT concreteness data frames; each list name
# determines the name of the ".prop" data frame created below.
conc_list <- list(
gpt.total.conc = gpt.total.conc,
gpt.inter.conc = gpt.inter.conc,
gpt.non.inter.conc = gpt.non.inter.conc
) # The list names become the ".prop" object names
process_and_rename_conc_dfs(conc_list)
# This creates gpt.total.conc.prop, gpt.inter.conc.prop, and
# gpt.non.inter.conc.prop in the global environment.
# For Valence
# Define the function to process a single data frame and create a new data frame with .prop added to its name
process_and_rename_val_dfs <- function(df_list) {
  # For each named data frame in df_list: strip the "%" sign from Positive..,
  # convert it to a 0-1 proportion in PropPositive, drop the raw percentage
  # columns, and write the result back to the global environment under the
  # name "<original name>.prop".
  for (nm in names(df_list)) {
    d <- df_list[[nm]]
    d$Positive.. <- as.numeric(gsub("%", "", d$Positive..))
    d$PropPositive <- d$Positive.. / 100
    d <- subset(d, select = -c(Positive.., Negative..))
    # Side effect: creates e.g. gpt.total.val.prop in the global environment
    assign(paste0(nm, ".prop"), d, envir = .GlobalEnv)
  }
}
# Named list of ChatGPT valence data frames; each list name determines the
# name of the ".prop" data frame created below.
val_list <- list(
gpt.total.val = gpt.total.val,
gpt.inter.val = gpt.inter.val,
gpt.non.inter.val = gpt.non.inter.val
) # The list names become the ".prop" object names
process_and_rename_val_dfs(val_list)
# This creates gpt.total.val.prop, gpt.inter.val.prop, and
# gpt.non.inter.val.prop in the global environment.
# Convert the human norming ratings to 0-1 proportions of their scale maxima:
# valence (V.Mean.Sum) is on a 1-9 scale, concreteness (Conc.M) on a 1-5 scale.
corpus.inter.prop <- transform(corpus.inter,
                               PropPositive = V.Mean.Sum / 9,
                               PropConcrete = Conc.M / 5)[, c("Word", "PropPositive", "PropConcrete", "DataSource")]
corpus.non.inter.prop <- transform(corpus.non.inter,
                                   PropPositive = V.Mean.Sum / 9,
                                   PropConcrete = Conc.M / 5)[, c("Word", "PropPositive", "PropConcrete", "DataSource")]
# Stack human and ChatGPT data, keeping only the measure each ChatGPT file
# provides (valence or concreteness).
inter.val.total <- rbind(subset(corpus.inter.prop, select = -PropConcrete), gpt.inter.val.prop)
non.inter.val.total <- rbind(subset(corpus.non.inter.prop, select = -PropConcrete), gpt.non.inter.val.prop)
inter.conc.total <- rbind(subset(corpus.inter.prop, select = -PropPositive), gpt.inter.conc.prop)
non.inter.conc.total <- rbind(subset(corpus.non.inter.prop, select = -PropPositive), gpt.non.inter.conc.prop)
is one list or the other more extreme?
Summary of Techniques:
To assess how close proportions are to extremes (0 or 1), you can use the logit transformation. This transformation converts a proportion to a scale that stretches out the extremes, making them more comparable to central values.
The logit transformation is defined as: logit(p)=log(p/(1-p))
Where:
- p is the proportion (between 0 and 1)
- The logit will be close to −∞ when p is near 0, and close to ∞ when p is near 1.
# Apply the logit transformation to stretch out proportions near 0 and 1.
# NOTE(review): logit() is not base R - presumably car::logit or boot::logit
# is attached earlier in the document; confirm which, since car::logit
# silently rescales inputs that are exactly 0 or 1.
# First for Valence
corpus.inter.prop$LogOddsPos <- logit(corpus.inter.prop$PropPositive)
corpus.non.inter.prop$LogOddsPos <- logit(corpus.non.inter.prop$PropPositive)
# Then for Concreteness
corpus.inter.prop$LogOddsConc <- logit(corpus.inter.prop$PropConcrete)
corpus.non.inter.prop$LogOddsConc <- logit(corpus.non.inter.prop$PropConcrete)
# Stack both heuristics into one long table of per-word log-odds
log.odds <- rbind(corpus.inter.prop,corpus.non.inter.prop)[,c("Word","DataSource","LogOddsPos","LogOddsConc")]
# Log-odds plot for valence, one point per word, grouped by heuristic.
# (Typo fix: title previously read "ProportionPostive".)
# NOTE(review): with a discrete x axis, geom_line(group = 1) draws a single
# line through every point in data order - confirm this is the intended look.
ggplot(log.odds, aes(x = DataSource, y = LogOddsPos)) +
  geom_point(size = 4) + # Scatter plot for log-odds
  geom_line(group = 1) + # Add a line connecting the points
  labs(title = "Log-Odds (Logit) of ProportionPositive by heuristic",
       y = "Log-Odds (Logit)",
       x = "Data Source (heuristic)")
# Log-odds plot for concreteness
ggplot(log.odds, aes(x = DataSource, y = LogOddsConc)) +
  geom_point(size = 4) + # Scatter plot for log-odds
  geom_line(group = 1) + # Add a line connecting the points
  labs(title = "Log-Odds (Logit) of ProportionConcrete by heuristic",
       y = "Log-Odds (Logit)",
       x = "Data Source (heuristic)")
Another simple approach is to compute the distance from the extremes (0 and 1). This is done by measuring the absolute difference between a proportion and the closest extreme.
For a proportion p, the distance from the closest extreme is min(p, 1 − p):
# Toy example: pmin(p, 1 - p) is 0 at the extremes and maximal (0.5) at p = 0.5
# Example proportions
prop <- c(0.01, 0.05, 0.5, 0.95, 0.99)
# Distance from closest extreme (0 or 1)
distance_from_extreme <- pmin(prop, 1 - prop)
distance_from_extreme
## [1] 0.01 0.05 0.50 0.05 0.01
# Distance from the nearest extreme (0 or 1) for each word; smaller = more extreme.
# First for Valence
corpus.inter.prop$DistExtremePos <- pmin(corpus.inter.prop$PropPositive, 1 - corpus.inter.prop$PropPositive)
corpus.non.inter.prop$DistExtremePos <- pmin(corpus.non.inter.prop$PropPositive, 1 - corpus.non.inter.prop$PropPositive)
# Then for Concreteness
# BUG FIX: these two lines previously reused PropPositive, so DistExtremeConc
# silently duplicated DistExtremePos; they must be computed from PropConcrete.
corpus.inter.prop$DistExtremeConc <- pmin(corpus.inter.prop$PropConcrete, 1 - corpus.inter.prop$PropConcrete)
corpus.non.inter.prop$DistExtremeConc <- pmin(corpus.non.inter.prop$PropConcrete, 1 - corpus.non.inter.prop$PropConcrete)
# Stack both heuristics into one long table of per-word distances
dist.extremes <- rbind(corpus.inter.prop,corpus.non.inter.prop)[,c("Word","DataSource","DistExtremePos","DistExtremeConc")]
# Mean distance per heuristic (valence)
agr <- dist.extremes %>%
group_by(DataSource) %>%
summarize(MeanDistExtremePos = mean(DistExtremePos))
# Mean distance from extremes per heuristic (valence): lower bars = more extreme
ggplot(agr, aes(x = DataSource, y = MeanDistExtremePos, fill=DataSource)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(MeanDistExtremePos, 2)), vjust = -0.5) +
labs(title = "Mean Distance from Extremes (0 or 1) for Valence",
y = "Distance from Extreme (0 or 1)",
x = "Category") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
guides(fill = "none")
# Per-word distance from extremes (valence), colored by heuristic
ggplot(dist.extremes, aes(x = Word, y = DistExtremePos, fill=DataSource)) +
geom_bar(stat = "identity") +
# geom_text(aes(label = round(DistExtremePos, 2)), vjust = -0.5) +
labs(title = "Distance from Extremes (0 or 1) for Valence",
y = "Distance from Extreme (0 or 1)",
x = "Category") +
# theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# guides(fill = "none")
# Mean distance per heuristic (concreteness).
# NOTE(review): before interpreting the concreteness plots, verify that
# DistExtremeConc was computed from PropConcrete (not PropPositive).
agr <- dist.extremes %>%
group_by(DataSource) %>%
summarize(MeanDistExtremeConc = mean(DistExtremeConc))
# Mean distance from extremes per heuristic (concreteness)
ggplot(agr, aes(x = DataSource, y = MeanDistExtremeConc, fill=DataSource)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(MeanDistExtremeConc, 2)), vjust = -0.5) +
labs(title = "Mean Distance from Extremes (0 or 1) for Concrete",
y = "Distance from Extreme (0 or 1)",
x = "Category") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
guides(fill = "none")
# Per-word distance from extremes (concreteness), colored by heuristic
ggplot(dist.extremes, aes(x = Word, y = DistExtremeConc, fill=DataSource)) +
geom_bar(stat = "identity") +
# geom_text(aes(label = round(DistExtremeConc, 2)), vjust = -0.5) +
labs(title = "Distance from Extremes (0 or 1) for Concrete",
y = "Distance from Extreme (0 or 1)",
x = "Category") +
# theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# guides(fill = "none")
If you have a collection of proportions and want to model how close they are to the extremes, you can fit a Beta distribution, which is commonly used to model proportions.
The Beta distribution has two shape parameters α and β that control the shape of the distribution. If the values of α and β are low (e.g., both less than 1), it indicates that the proportions are concentrated near 0 or 1.
# Fit Beta distributions for valence (fitdistr is from MASS); starting values
# shape1 = shape2 = 1 correspond to the uniform distribution.
fit.inter.val <- fitdistr(corpus.inter.prop$PropPositive, dbeta, start = list(shape1 = 1, shape2 = 1))
fit.non.inter.val <- fitdistr(corpus.non.inter.prop$PropPositive, dbeta, start = list(shape1 = 1, shape2 = 1))
fit.inter.val
## shape1 shape2
## 2.1455144 1.7718904
## (0.4615893) (0.3724176)
fit.non.inter.val
## shape1 shape2
## 1.9264999 1.5991460
## (0.4131392) (0.3345259)
# Fit Beta distributions for concreteness
fit.inter.conc <- fitdistr(corpus.inter.prop$PropConcrete, dbeta, start = list(shape1 = 1, shape2 = 1))
# NOTE(review): the NaN warnings below presumably come from dbeta being probed
# at parameter values where proportions of exactly 0 or 1 are undefined -
# verify the concreteness proportions before trusting these estimates.
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
fit.non.inter.con <- fitdistr(corpus.non.inter.prop$PropConcrete, dbeta, start = list(shape1 = 1, shape2 = 1))
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
fit.inter.conc
## shape1 shape2
## 1.5020517 0.9091160
## (0.3290508) (0.1816138)
# NOTE(review): "fit.non.inter.con" is missing the final "c" - consider
# renaming to fit.non.inter.conc for consistency with fit.inter.conc.
fit.non.inter.con
## shape1 shape2
## 1.5913480 0.9674368
## (0.3485365) (0.1941128)
# Fit a beta distribution to the valence proportions of each heuristic
# (fitdistr is from MASS; both shapes start at 1, i.e. the uniform).
fit1 <- fitdistr(corpus.inter.prop$PropPositive, dbeta, start = list(shape1 = 1, shape2 = 1))
alpha1 <- fit1$estimate["shape1"]
beta1 <- fit1$estimate["shape2"]
# Fit beta distribution for dataset 2
fit2 <- fitdistr(corpus.non.inter.prop$PropPositive, dbeta, start = list(shape1 = 1, shape2 = 1))
alpha2 <- fit2$estimate["shape1"]
beta2 <- fit2$estimate["shape2"]
# Print fitted alpha and beta for both datasets
print(paste("Intersective: alpha =", alpha1, "beta =", beta1))
## [1] "Intersective: alpha = 2.14551440425017 beta = 1.77189042737386"
print(paste("Non-Intersective: alpha =", alpha2, "beta =", beta2))
## [1] "Non-Intersective: alpha = 1.92649993432997 beta = 1.59914596166821"
# Compare AIC values (lower AIC indicates a better fit): AIC = -2*logLik + 2k, k = 2 parameters
AIC1 <- fit1$loglik * -2 + 2 * 2 # AIC for dataset 1
AIC2 <- fit2$loglik * -2 + 2 * 2 # AIC for dataset 2
print(paste("AIC for Intersective:", AIC1))
## [1] "AIC for Intersective: -6.21133465768471"
print(paste("AIC for Non-Intersective:", AIC2))
## [1] "AIC for Non-Intersective: -3.7847393139852"
# Create a sequence of values from 0 to 1 for plotting
x <- seq(0, 1, length.out = 100)
# Calculate beta densities for both datasets
y1 <- dbeta(x, shape1 = alpha1, shape2 = beta1)
y2 <- dbeta(x, shape1 = alpha2, shape2 = beta2)
# Create dataframes for plotting
df1 <- data.frame(x = x, y = y1, Dataset = "Intersective")
df2 <- data.frame(x = x, y = y2, Dataset = "Non-Intersective")
# Combine the dataframes
df_combined <- rbind(df1, df2)
# Plot the beta distributions for both datasets.
# FIX: `linewidth` replaces the `size` aesthetic for lines, which was
# deprecated in ggplot2 3.4.0 (this removes the deprecation warning).
ggplot(df_combined, aes(x = x, y = y, color = Dataset)) +
  geom_line(linewidth = 1.2) +
  labs(title = "Comparison of Beta Distribution Fits for Valence",
       x = "Proportion",
       y = "Density")
# theme_minimal() +
# scale_color_manual(values = c("blue", "red"))
# Fit a beta distribution to the concreteness proportions of each heuristic
fit1 <- fitdistr(corpus.inter.prop$PropConcrete, dbeta, start = list(shape1 = 1, shape2 = 1))
# NOTE(review): the NaN warnings presumably arise when the optimizer probes
# shape values for which dbeta is undefined at boundary proportions - verify
# before trusting the estimates.
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
alpha1 <- fit1$estimate["shape1"]
beta1 <- fit1$estimate["shape2"]
# Fit beta distribution for dataset 2
fit2 <- fitdistr(corpus.non.inter.prop$PropConcrete, dbeta, start = list(shape1 = 1, shape2 = 1))
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
alpha2 <- fit2$estimate["shape1"]
beta2 <- fit2$estimate["shape2"]
# Print fitted alpha and beta for both datasets
print(paste("Intersective: alpha =", alpha1, "beta =", beta1))
## [1] "Intersective: alpha = 1.50205166474088 beta = 0.9091159725851"
print(paste("Non-Intersective: alpha =", alpha2, "beta =", beta2))
## [1] "Non-Intersective: alpha = 1.59134800730586 beta = 0.967436845938341"
# Compare AIC values (lower AIC indicates a better fit); k = 2 estimated parameters
AIC1 <- fit1$loglik * -2 + 2 * 2 # AIC for dataset 1
AIC2 <- fit2$loglik * -2 + 2 * 2 # AIC for dataset 2
print(paste("AIC for Intersective:", AIC1))
## [1] "AIC for Intersective: -4.06524650335086"
print(paste("AIC for Non-Intersective:", AIC2))
## [1] "AIC for Non-Intersective: -4.21931277561189"
# Create a sequence of values from 0 to 1 for plotting
x <- seq(0, 1, length.out = 100)
# Calculate beta densities for both datasets
y1 <- dbeta(x, shape1 = alpha1, shape2 = beta1)
y2 <- dbeta(x, shape1 = alpha2, shape2 = beta2)
# Create dataframes for plotting
df1 <- data.frame(x = x, y = y1, Dataset = "Intersective")
df2 <- data.frame(x = x, y = y2, Dataset = "Non-Intersective")
# Combine the dataframes
df_combined <- rbind(df1, df2)
# Plot the beta distributions for both datasets.
# FIX: `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0).
ggplot(df_combined, aes(x = x, y = y, color = Dataset)) +
  geom_line(linewidth = 1.2) +
  labs(title = "Comparison of Beta Distribution Fits for Concreteness",
       x = "Proportion",
       y = "Density")
# theme_minimal() +
# scale_color_manual(values = c("blue", "red"))
You can compute the variance of proportions to get a sense of how spread out they are. High variance in proportions usually indicates that some values are close to the extremes (0 or 1).
The formula for the variance of a single proportion is: Var(p) = p(1 − p)
In R, you can calculate the variance for individual proportions or the overall variance for a set of proportions:
# Bernoulli variance p(1 - p) for each word's proportion; it peaks at p = 0.5,
# so low variance means the proportion sits near an extreme (0 or 1).
corpus.inter.prop <- within(corpus.inter.prop,
                            PropVariancePos <- PropPositive * (1 - PropPositive))
corpus.non.inter.prop <- within(corpus.non.inter.prop,
                                PropVariancePos <- PropPositive * (1 - PropPositive))
prop.var.val <- rbind(corpus.inter.prop, corpus.non.inter.prop)[, c("Word", "DataSource", "PropPositive", "PropVariancePos")]
corpus.inter.prop <- within(corpus.inter.prop,
                            PropVarianceConc <- PropConcrete * (1 - PropConcrete))
corpus.non.inter.prop <- within(corpus.non.inter.prop,
                                PropVarianceConc <- PropConcrete * (1 - PropConcrete))
prop.var.conc <- rbind(corpus.inter.prop, corpus.non.inter.prop)[, c("Word", "DataSource", "PropConcrete", "PropVarianceConc")]
# Join the valence and concreteness variance tables on Word + DataSource
prop.var.total <- inner_join(prop.var.val, prop.var.conc, by = c("Word", "DataSource"))
# # Melt the specified columns
# melted_data <- prop.var.total %>%
# mutate(
# # Calculate Proportion and Variance
# Proportion_Positive = PropPositive / PropConcrete,
# Variance_Positive = PropVariancePos / PropVarianceConc
# ) %>%
# select(Word, DataSource, Proportion_Positive, Variance_Positive) %>%
# pivot_longer(
# cols = c(Proportion_Positive, Variance_Positive),
# names_to = c("Measure", ".value"),
# names_pattern = "^(.*)$"
# ) %>%
# mutate(
# Measure = "Positive" # All values from PropPositive belong to "Positive"
# ) %>%
# # Create a second part for Concrete
# bind_rows(data %>%
# mutate(
# Proportion_Concrete = PropConcrete,
# Variance_Concrete = PropVarianceConc
# ) %>%
# select(Word, DataSource, Proportion_Concrete, Variance_Concrete) %>%
# pivot_longer(
# cols = c(Proportion_Concrete, Variance_Concrete),
# names_to = c("Measure", ".value"),
# names_pattern = "^(.*)$"
# ) %>%
# mutate(
# Measure = "Concrete" # All values from PropConcrete belong to "Concrete"
# )
# )
# Per-word variance of the valence proportions for both heuristics
ggplot(prop.var.val, aes(x = Word, y = PropVariancePos, fill = DataSource)) +
  geom_bar(stat = "identity") +
  labs(title = "Variance of Proportions Valence",
       x = "Word",
       y = "Variance") +
  # theme_minimal() +
  # scale_fill_manual(values = c("blue", "red")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Variance as a function of the proportion itself (inverted U, peak at p = 0.5).
# FIX: `linewidth` replaces the `size` aesthetic for lines (deprecated in ggplot2 3.4.0).
ggplot(prop.var.val, aes(x = PropPositive, y = PropVariancePos, color = DataSource)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  labs(title = "Variance of Proportions Valence",
       x = "Proportion",
       y = "Variance")
# theme_minimal() +
# scale_color_manual(values = c("blue", "red"))
# Same two plots for concreteness
ggplot(prop.var.conc, aes(x = Word, y = PropVarianceConc, fill = DataSource)) +
  geom_bar(stat = "identity") +
  labs(title = "Variance of Proportions Concrete",
       x = "Word",
       y = "Variance") +
  # theme_minimal() +
  # scale_fill_manual(values = c("blue", "red")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(prop.var.conc, aes(x = PropConcrete, y = PropVarianceConc, color = DataSource)) +
  geom_point(size = 3) +
  geom_line(linewidth = 1) +
  labs(title = "Variance of Proportions Concrete",
       x = "Proportion",
       y = "Variance")
# theme_minimal() +
# scale_color_manual(values = c("blue", "red"))
Between the two heuristics for choosing extreme examples, does Chat GPT agree with one more than the other?
# Bar chart of ChatGPT's valence estimates for its self-chosen ("total") list
dodge = position_dodge(.9)
ggplot(data=gpt.total.val.prop, aes(x=Word,y=PropPositive,fill=Word)) +
geom_bar(position=dodge,stat="identity") +
# facet_wrap(~Word,ncol=5) +
# theme(axis.text.x = element_blank(), # Remove x-axis labels
# axis.title.x = element_blank()) # Remove x-axis title
ggtitle("ChatGPT Valence Estimates from Total List") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
# Bar chart of ChatGPT's concreteness estimates for its self-chosen ("total") list
dodge = position_dodge(.9)
ggplot(data=gpt.total.conc.prop, aes(x=Word,y=PropConcrete,fill=Word)) +
geom_bar(position=dodge,stat="identity") +
# facet_wrap(~Word,ncol=5) +
# theme(axis.text.x = element_blank(), # Remove x-axis labels
# axis.title.x = element_blank()) # Remove x-axis title
ggtitle("ChatGPT Concreteness Estimates from Total List") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
These two lists have a lot of agreement
# Word lists chosen by the two heuristics (40 words each)
X = corpus.inter$Word
Y = corpus.non.inter$Word
# Element-wise agreement between the two Word vectors.
# NOTE(review): this comparison is positional, so it depends on row order;
# the note at the end of the file flags that the ordering may not be aligned.
agreement <- X == Y
# Proportion of exact matches
prop_agreement <- mean(agreement)
print(prop_agreement)
## [1] 0.575
# Cohen's Kappa between two categorical columns.
# NOTE(review): kappa2() is not base R - presumably from the irr package.
kappa_result <- kappa2(data.frame(X, Y))
print(kappa_result)
## Cohen's Kappa for 2 Raters (Weights: unweighted)
##
## Subjects = 40
## Raters = 2
## Kappa = 0.566
##
## z = 24.8
## p-value = 0
# The output will provide a kappa statistic (value), where:
#
# 0.81 - 1.00 = Almost perfect agreement
# 0.61 - 0.80 = Substantial agreement
# 0.41 - 0.60 = Moderate agreement
# 0.21 - 0.40 = Fair agreement
# 0.00 - 0.20 = Slight agreement
# Order-independent overlap between the two word lists
common_values <- intersect(X, Y)
length(common_values)
## [1] 33
print(common_values)
## [1] "disown" "profane" "degrade" "displease" "intimidate"
## [6] "distrust" "kiss" "sing" "dance" "sleep"
## [11] "cook" "eat" "swing" "hope" "inspire"
## [16] "enlighten" "imagine" "believe" "fascinate" "zing"
## [21] "inspired" "know" "deserve" "vomit" "piss"
## [26] "slap" "spit" "barf" "puke" "tear"
## [31] "weep" "ditch" "bleed"
# Words chosen by the intersective heuristic but not the non-intersective one
diff <- setdiff(X, Y)
print(diff)
## [1] "deceive" "envy" "jeopardize" "dislike" "sail"
## [6] "skate" "swim"
Is one or other of the extreme heuristics more concordant with Chat GPT?
# Overlap of ChatGPT's self-chosen valence list with the intersective corpus
are_identical <- identical(gpt.total.val.prop$Word, corpus.inter$Word)
print(are_identical)
## [1] FALSE
# Find common values (order-independent)
common_values <- intersect(gpt.total.val.prop$Word, corpus.inter$Word)
length(common_values)
## [1] 2
print(common_values)
## [1] "inspire" "deceive"
# Overlap of ChatGPT's self-chosen valence list with the non-intersective corpus
are_identical <- identical(gpt.total.val.prop$Word, corpus.non.inter$Word)
print(are_identical)
## [1] FALSE
# Find common values
common_values <- intersect(gpt.total.val.prop$Word, corpus.non.inter$Word)
length(common_values)
## [1] 2
print(common_values)
## [1] "inspire" "hate"
# Inspect the four word lists side by side
print(gpt.total.val.prop$Word)
## [1] "inspire" "achieve" "love" "support" "celebrate"
## [6] "thrive" "adore" "bless" "empower" "enjoy"
## [11] "motivate" "succeed" "create" "nurture" "engage"
## [16] "facilitate" "help" "explore" "unite" "innovate"
## [21] "hate" "destroy" "hurt" "choke" "suffer"
## [26] "kill" "deceive" "abandon" "terrorize" "scold"
## [31] "insult" "ruin" "embarrass" "burden" "neglect"
## [36] "fail" "torment" "offend" "discourage" "reject"
print(gpt.total.conc.prop$Word)
## [1] "love" "inspire" "motivate" "empower" "support"
## [6] "achieve" "nurture" "celebrate" "engage" "explore"
## [11] "create" "facilitate" "unite" "thrive" "bless"
## [16] "adore" "succeed" "joy" "hope" "sorrow"
## [21] "trust" "kill" "destroy" "hurt" "scold"
## [26] "offend" "reject" "choke" "fail" "neglect"
## [31] "torment" "terrorize" "burden" "suffer" "ruin"
## [36] "deceive" "embarrass" "chase" "talk" "laugh"
## [41] "cry"
print(corpus.non.inter$Word)
## [1] "disown" "hate" "profane" "degrade" "displease"
## [6] "distrust" "annoy" "violate" "condemn" "intimidate"
## [11] "kiss" "sing" "laugh" "bathe" "sleep"
## [16] "dance" "smooch" "eat" "swing" "cook"
## [21] "hope" "inspire" "enlighten" "imagine" "believe"
## [26] "fascinate" "zing" "inspired" "know" "deserve"
## [31] "vomit" "piss" "slap" "spit" "barf"
## [36] "puke" "tear" "weep" "ditch" "bleed"
print(corpus.inter$Word)
## [1] "disown" "profane" "degrade" "displease" "intimidate"
## [6] "deceive" "distrust" "envy" "jeopardize" "dislike"
## [11] "kiss" "sing" "dance" "sleep" "cook"
## [16] "eat" "swing" "sail" "skate" "swim"
## [21] "hope" "inspire" "enlighten" "imagine" "believe"
## [26] "fascinate" "zing" "inspired" "know" "deserve"
## [31] "vomit" "piss" "slap" "spit" "barf"
## [36] "puke" "tear" "weep" "ditch" "bleed"
Is one or other of the extreme heuristics more concordant with Chat GPT?
# Conc + corpus inter: overlap of ChatGPT's self-chosen concreteness list
# with the intersective corpus
are_identical <- identical(gpt.total.conc.prop$Word, corpus.inter$Word)
print(are_identical)
## [1] FALSE
# Find common values (order-independent)
common_values <- intersect(gpt.total.conc.prop$Word, corpus.inter$Word)
length(common_values)
## [1] 3
print(common_values)
## [1] "inspire" "hope" "deceive"
# Conc + corpus non-inter: overlap with the non-intersective corpus
are_identical <- identical(gpt.total.conc.prop$Word, corpus.non.inter$Word)
print(are_identical)
## [1] FALSE
# Find common values
common_values <- intersect(gpt.total.conc.prop$Word, corpus.non.inter$Word)
length(common_values)
## [1] 3
print(common_values)
## [1] "inspire" "hope" "laugh"
Actually, this needs to be redone, because the data frames aren't ordered in the same way, so the positional comparison isn't legitimate.
# Reshape to wide format so each word is one row with one proportion column
# per DataSource; this aligns human and ChatGPT values by Word rather than
# by row position, making the comparison order-independent.
inter.val.total.wide <- inter.val.total %>%
pivot_wider(
names_from = DataSource,
values_from = PropPositive,
names_prefix = "", # NOTE(review): likely ignored when names_glue is given - confirm
names_glue = "{.value}_{DataSource}" # Use custom naming for new columns
)
# head(inter.val.total.wide)
# Rename columns for clarity
inter.val.total.wide <- inter.val.total.wide %>%
rename(
ChatGPT_PropPos = PropPositive_ChatGPT_extreme_inter,
Norming_PropPos = PropPositive_human_norming_intersective
)